
#####################################################
## F-statistic simulations
#####################################################
library(pacman)
library(LalRUtils) # install_github("apoorval/LalRUtils")
pacman::p_load(
  tidyverse, data.table, foreach, lfe, mvtnorm, ivmodel,
  doParallel, patchwork, tictoc
)
theme_set(theme_minimal())
set.seed(42)


# %% ####################################################
gen_clustered = function(π = c(0, 0.01), n_cluster = 50, n = 1000, ρ = .5) {
  # individual level
  Σ_i = matrix(c(1, 0, 0, 1 - ρ), ncol = 2)
  values_i = rmvnorm(n = n, sigma = Σ_i)
  # cluster level
  cluster_name = rep(1:n_cluster, each = n / n_cluster)
  Σ_cl = matrix(c(1, 0, 0, ρ), ncol = 2)
  values_cl = rmvnorm(n = n_cluster, sigma = Σ_cl)
  # predictor var consists of individual- and cluster-level components
  z = values_i[, 1] + rep(values_cl[, 1], each = n / n_cluster)
  # error consists of individual- and cluster-level components
  ε = values_i[, 2] + rep(values_cl[, 2], each = n / n_cluster)
  d = cbind(1, z) %*% π + ε
  df = data.frame(z, d, cluster = as.factor(cluster_name))
  return(df)
}

# %%
fstat = function(df, cl = 'cluster') {
  t = felm(formula_lfe('d', 'z', C = cl), df) %>%
    summary %>%
    .$coefficients %>%
    .[2, 3]
  t^2
}

boot_run = function(data, cl = 'cluster', cl0 = '0',
                     p_iv = 1, nboots = 100, seed = 94305, cores = 8) {
  n = nrow(data);
  if (cl != "0") {
    clusters = unique(data[, cl]); id.list = split(1:n, data[, cl]); ncl = length(clusters)
  }
  boot.core = function() {
    if (cl == "0") {
      # draw bootstrap sample
      smp = sample(1:n, n, replace = TRUE)
    } else {
      # block bootstrap
      cluster.boot = sample(clusters, ncl, replace = TRUE)
      smp = unlist(id.list[match(cluster.boot, clusters)]) # match to locate the position of the clusterin the list
    }
    s = data[smp, ]
    lm(d ~ z, s[-ncol(s)])$coefficients[-1]
  }
  doParallel::registerDoParallel(cores)
  coefs.boot = foreach(i = 1:nboots, .combine = rbind) %dopar% {
    boot.core()
  }
  fs_coefs0 = lm(d ~ z, data)$coefficients[-1]
  F_boot = c((t(fs_coefs0) %*% solve(var(coefs.boot)) %*% fs_coefs0) / p_iv)
  return(c(F_boot, fstat(data, cl0)))
}

# %%
scatter_Fs = function(π, n_cl, nr = 100, cl = "cluster", cl0 = "0") {
  fSims = replicate(nr, boot_run(gen_clustered(π, n_cluster = n_cl), cl = cl, cl0 = cl0))
  df = data.table(t(fSims)); names(df) = c("bootF", "analyticF")
  xmax = max(apply(df, 2, max))
  ggplot(df, aes(analyticF, bootF)) +
    xlim(c(0, xmax)) +
    ylim(c(0, xmax)) +
    geom_abline(intercept = 0, slope = 1, color = 'cornflowerblue', size = 1.5) +
    geom_point() +
    ggtitle(glue::glue("Coef = {π[2]}; n_cluster = {n_cl}"))
}

# %% cluster bootstrap F & non-clustered analytic F
tic()
p0 = scatter_Fs(c(0, 0.5), 50, cl0 = "0")
p1 = scatter_Fs(c(0, 0.5), 10, cl0 = "0")
p2 = scatter_Fs(c(0, 0.001), 50, cl0 = "0")
p3 = scatter_Fs(c(0, 0.001), 10, cl0 = "0")
toc()
(f0 = (p0 | p1) / (p2 | p3) +
  plot_annotation("Cluster-bootstrap F and (Non-Clustered) Robust Analytic F")
)



# %% cluster bootstrap F & cluster analytic F
tic()
p10 = scatter_Fs(c(0, 0.5), 50, cl0 = "cluster")
p11 = scatter_Fs(c(0, 0.5), 10, cl0 = "cluster")
p12 = scatter_Fs(c(0, 0.001), 50, cl0 = "cluster")
p13 = scatter_Fs(c(0, 0.001), 10, cl0 = "cluster")
toc()
(f1 = (p10 | p11) / (p12 | p13) +
  plot_annotation("Cluster-bootstrap F and clustered robust analytic F")
)

ggsave('graphs/FigureA5a.pdf', f0, width = 10)
ggsave('graphs/FigureA5b.pdf', f1, width = 10)


